import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
# Raise the display limits to 111 rows/columns so wide tables are shown in full.
# The full option names are spelled out: abbreviated names only work through
# pandas' pattern matching and can break if a similarly named option is added.
pd.set_option('display.max_rows', 111)
pd.set_option('display.max_columns', 111)
# Load the raw dataset from the Excel workbook (path relative to the working directory).
file = pd.read_excel("dataset.xlsx")
# Preview the first rows of the raw table.
file.head()
| Patient ID | Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Serum Glucose | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Mycoplasma pneumoniae | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Neutrophils | Urea | Proteina C reativa mg/dL | Creatinine | Potassium | Sodium | Influenza B, rapid test | Influenza A, rapid test | Alanine transaminase | Aspartate transaminase | Gamma-glutamyltransferase | Total Bilirubin | Direct Bilirubin | Indirect Bilirubin | Alkaline phosphatase | Ionized calcium | Strepto A | Magnesium | pCO2 (venous blood gas analysis) | Hb saturation (venous blood gas analysis) | Base excess (venous blood gas analysis) | pO2 (venous blood gas analysis) | Fio2 (venous blood gas analysis) | Total CO2 (venous blood gas analysis) | pH (venous blood gas analysis) | HCO3 (venous blood gas analysis) | Rods # | Segmented | Promyelocytes | Metamyelocytes | Myelocytes | Myeloblasts | Urine - Esterase | Urine - Aspect | Urine - pH | Urine - Hemoglobin | Urine - Bile pigments | Urine - Ketone Bodies | Urine - Nitrite | Urine - Density | Urine - Urobilinogen | Urine - Protein | Urine - Sugar | Urine - Leukocytes | Urine - Crystals | Urine - Red blood cells | Urine - Hyaline cylinders | Urine - Granular cylinders | Urine - Yeasts | Urine - Color | Partial thromboplastin 
time (PTT) | Relationship (Patient/Normal) | International normalized ratio (INR) | Lactic Dehydrogenase | Prothrombin time (PT), Activity | Vitamin B12 | Creatine phosphokinase (CPK) | Ferritin | Arterial Lactic Acid | Lipase dosage | D-Dimer | Albumin | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis) | pO2 (arterial blood gas analysis) | Arteiral Fio2 | Phosphor | ctO2 (arterial blood gas analysis) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 44477f75e8169d2 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 126e9dd13932f68 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | -0.140648 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | -0.619086 | 1.198059 | -0.147895 | 2.089928 | -0.305787 | 0.862512 | negative | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | a46b4402a0e5696 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | f7d619a94f97c45 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | d9e41465789c2b5 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Work on a copy so the DataFrame held in `file` is left unmodified.
df = file.copy()
Notre dataset contient 5644 observations et 111 variables (colonnes), dont la variable cible "SARS-Cov-2 exam result" et l'identifiant "Patient ID".
# Dimensions of the dataset as (rows, columns).
df.shape
(5644, 111)
Nous avons 74 variables quantitatives (70 de type float64 et 4 de type int64) et 37 variables qualitatives (type object).
# Count how many columns there are of each dtype.
df.dtypes.value_counts()
float64 70 object 37 int64 4 dtype: int64
- Nous avons décidé de visualiser notre dataset en entier avec la fonction heatmap.
- Après avoir mesuré le pourcentage de valeurs manquantes, nous avons décidé (puisqu'il y a beaucoup de valeurs manquantes) d'éliminer toutes les variables dont le pourcentage de données manquantes est supérieur ou égal à 90 % : ce sont des colonnes inutiles.
- Après cette opération, on est passé de 111 colonnes à 39 colonnes.
- Nous avons aussi décidé d'éliminer la colonne "Patient ID", car elle ne contient que l'identifiant des patients, ce qui n'apporte aucune information utile à l'analyse.
# Global share of missing cells over the whole table, expressed as a percentage.
missing_mask = df.isnull()
# First sum collapses rows (per-column counts), second sum collapses columns.
total_missing = missing_mask.sum().sum()
# DataFrame.size is rows * columns; it replaces np.product, which was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size
global_missing_percentage = (total_missing / total_cells) * 100
print(global_missing_percentage)
88.06003026414082
# Boolean table with the same shape as df: True where a value is missing.
df.isna()
| Patient ID | Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Serum Glucose | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Mycoplasma pneumoniae | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Neutrophils | Urea | Proteina C reativa mg/dL | Creatinine | Potassium | Sodium | Influenza B, rapid test | Influenza A, rapid test | Alanine transaminase | Aspartate transaminase | Gamma-glutamyltransferase | Total Bilirubin | Direct Bilirubin | Indirect Bilirubin | Alkaline phosphatase | Ionized calcium | Strepto A | Magnesium | pCO2 (venous blood gas analysis) | Hb saturation (venous blood gas analysis) | Base excess (venous blood gas analysis) | pO2 (venous blood gas analysis) | Fio2 (venous blood gas analysis) | Total CO2 (venous blood gas analysis) | pH (venous blood gas analysis) | HCO3 (venous blood gas analysis) | Rods # | Segmented | Promyelocytes | Metamyelocytes | Myelocytes | Myeloblasts | Urine - Esterase | Urine - Aspect | Urine - pH | Urine - Hemoglobin | Urine - Bile pigments | Urine - Ketone Bodies | Urine - Nitrite | Urine - Density | Urine - Urobilinogen | Urine - Protein | Urine - Sugar | Urine - Leukocytes | Urine - Crystals | Urine - Red blood cells | Urine - Hyaline cylinders | Urine - Granular cylinders | Urine - Yeasts | Urine - Color | Partial thromboplastin 
time (PTT) | Relationship (Patient/Normal) | International normalized ratio (INR) | Lactic Dehydrogenase | Prothrombin time (PT), Activity | Vitamin B12 | Creatine phosphokinase (CPK) | Ferritin | Arterial Lactic Acid | Lipase dosage | D-Dimer | Albumin | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis) | pO2 (arterial blood gas analysis) | Arteiral Fio2 | Phosphor | ctO2 (arterial blood gas analysis) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 2 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 3 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 4 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5639 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 5640 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 5641 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 5642 | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | False | False | False | False | False | True | False | False | False | True | False | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 5643 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | False | False | False | False | False | False | True | True | False | False | False | False | False | False | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | False | True | True | False | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
5644 rows × 111 columns
# Number of missing values in each column.
df.isna().sum()
Patient ID 0 Patient age quantile 0 SARS-Cov-2 exam result 0 Patient addmited to regular ward (1=yes, 0=no) 0 Patient addmited to semi-intensive unit (1=yes, 0=no) 0 Patient addmited to intensive care unit (1=yes, 0=no) 0 Hematocrit 5041 Hemoglobin 5041 Platelets 5042 Mean platelet volume 5045 Red blood Cells 5042 Lymphocytes 5042 Mean corpuscular hemoglobin concentration (MCHC) 5042 Leukocytes 5042 Basophils 5042 Mean corpuscular hemoglobin (MCH) 5042 Eosinophils 5042 Mean corpuscular volume (MCV) 5042 Monocytes 5043 Red blood cell distribution width (RDW) 5042 Serum Glucose 5436 Respiratory Syncytial Virus 4290 Influenza A 4290 Influenza B 4290 Parainfluenza 1 4292 CoronavirusNL63 4292 Rhinovirus/Enterovirus 4292 Mycoplasma pneumoniae 5644 Coronavirus HKU1 4292 Parainfluenza 3 4292 Chlamydophila pneumoniae 4292 Adenovirus 4292 Parainfluenza 4 4292 Coronavirus229E 4292 CoronavirusOC43 4292 Inf A H1N1 2009 4292 Bordetella pertussis 4292 Metapneumovirus 4292 Parainfluenza 2 4292 Neutrophils 5131 Urea 5247 Proteina C reativa mg/dL 5138 Creatinine 5220 Potassium 5273 Sodium 5274 Influenza B, rapid test 4824 Influenza A, rapid test 4824 Alanine transaminase 5419 Aspartate transaminase 5418 Gamma-glutamyltransferase 5491 Total Bilirubin 5462 Direct Bilirubin 5462 Indirect Bilirubin 5462 Alkaline phosphatase 5500 Ionized calcium 5594 Strepto A 5312 Magnesium 5604 pCO2 (venous blood gas analysis) 5508 Hb saturation (venous blood gas analysis) 5508 Base excess (venous blood gas analysis) 5508 pO2 (venous blood gas analysis) 5508 Fio2 (venous blood gas analysis) 5643 Total CO2 (venous blood gas analysis) 5508 pH (venous blood gas analysis) 5508 HCO3 (venous blood gas analysis) 5508 Rods # 5547 Segmented 5547 Promyelocytes 5547 Metamyelocytes 5547 Myelocytes 5547 Myeloblasts 5547 Urine - Esterase 5584 Urine - Aspect 5574 Urine - pH 5574 Urine - Hemoglobin 5574 Urine - Bile pigments 5574 Urine - Ketone Bodies 5587 Urine - Nitrite 5643 Urine - Density 5574 Urine - Urobilinogen 
5575 Urine - Protein 5584 Urine - Sugar 5644 Urine - Leukocytes 5574 Urine - Crystals 5574 Urine - Red blood cells 5574 Urine - Hyaline cylinders 5577 Urine - Granular cylinders 5575 Urine - Yeasts 5574 Urine - Color 5574 Partial thromboplastin time (PTT) 5644 Relationship (Patient/Normal) 5553 International normalized ratio (INR) 5511 Lactic Dehydrogenase 5543 Prothrombin time (PT), Activity 5644 Vitamin B12 5641 Creatine phosphokinase (CPK) 5540 Ferritin 5621 Arterial Lactic Acid 5617 Lipase dosage 5636 D-Dimer 5644 Albumin 5631 Hb saturation (arterial blood gases) 5617 pCO2 (arterial blood gas analysis) 5617 Base excess (arterial blood gas analysis) 5617 pH (arterial blood gas analysis) 5617 Total CO2 (arterial blood gas analysis) 5617 HCO3 (arterial blood gas analysis) 5617 pO2 (arterial blood gas analysis) 5617 Arteiral Fio2 5624 Phosphor 5624 ctO2 (arterial blood gas analysis) 5617 dtype: int64
# Draw the missing-value mask as a heatmap, without the color bar.
na_cells = df.isna()
sns.heatmap(na_cells, cbar=False)
<AxesSubplot:>
# Fraction of missing values per column (missing count divided by row count).
na_per_column = df.isna().sum()
na_per_column / len(df)
Patient ID 0.000000 Patient age quantile 0.000000 SARS-Cov-2 exam result 0.000000 Patient addmited to regular ward (1=yes, 0=no) 0.000000 Patient addmited to semi-intensive unit (1=yes, 0=no) 0.000000 Patient addmited to intensive care unit (1=yes, 0=no) 0.000000 Hematocrit 0.893161 Hemoglobin 0.893161 Platelets 0.893338 Mean platelet volume 0.893870 Red blood Cells 0.893338 Lymphocytes 0.893338 Mean corpuscular hemoglobin concentration (MCHC) 0.893338 Leukocytes 0.893338 Basophils 0.893338 Mean corpuscular hemoglobin (MCH) 0.893338 Eosinophils 0.893338 Mean corpuscular volume (MCV) 0.893338 Monocytes 0.893515 Red blood cell distribution width (RDW) 0.893338 Serum Glucose 0.963147 Respiratory Syncytial Virus 0.760099 Influenza A 0.760099 Influenza B 0.760099 Parainfluenza 1 0.760454 CoronavirusNL63 0.760454 Rhinovirus/Enterovirus 0.760454 Mycoplasma pneumoniae 1.000000 Coronavirus HKU1 0.760454 Parainfluenza 3 0.760454 Chlamydophila pneumoniae 0.760454 Adenovirus 0.760454 Parainfluenza 4 0.760454 Coronavirus229E 0.760454 CoronavirusOC43 0.760454 Inf A H1N1 2009 0.760454 Bordetella pertussis 0.760454 Metapneumovirus 0.760454 Parainfluenza 2 0.760454 Neutrophils 0.909107 Urea 0.929660 Proteina C reativa mg/dL 0.910347 Creatinine 0.924876 Potassium 0.934266 Sodium 0.934444 Influenza B, rapid test 0.854713 Influenza A, rapid test 0.854713 Alanine transaminase 0.960135 Aspartate transaminase 0.959957 Gamma-glutamyltransferase 0.972892 Total Bilirubin 0.967753 Direct Bilirubin 0.967753 Indirect Bilirubin 0.967753 Alkaline phosphatase 0.974486 Ionized calcium 0.991141 Strepto A 0.941176 Magnesium 0.992913 pCO2 (venous blood gas analysis) 0.975904 Hb saturation (venous blood gas analysis) 0.975904 Base excess (venous blood gas analysis) 0.975904 pO2 (venous blood gas analysis) 0.975904 Fio2 (venous blood gas analysis) 0.999823 Total CO2 (venous blood gas analysis) 0.975904 pH (venous blood gas analysis) 0.975904 HCO3 (venous blood gas analysis) 0.975904 Rods # 0.982814 
Segmented 0.982814 Promyelocytes 0.982814 Metamyelocytes 0.982814 Myelocytes 0.982814 Myeloblasts 0.982814 Urine - Esterase 0.989369 Urine - Aspect 0.987597 Urine - pH 0.987597 Urine - Hemoglobin 0.987597 Urine - Bile pigments 0.987597 Urine - Ketone Bodies 0.989901 Urine - Nitrite 0.999823 Urine - Density 0.987597 Urine - Urobilinogen 0.987775 Urine - Protein 0.989369 Urine - Sugar 1.000000 Urine - Leukocytes 0.987597 Urine - Crystals 0.987597 Urine - Red blood cells 0.987597 Urine - Hyaline cylinders 0.988129 Urine - Granular cylinders 0.987775 Urine - Yeasts 0.987597 Urine - Color 0.987597 Partial thromboplastin time (PTT) 1.000000 Relationship (Patient/Normal) 0.983877 International normalized ratio (INR) 0.976435 Lactic Dehydrogenase 0.982105 Prothrombin time (PT), Activity 1.000000 Vitamin B12 0.999468 Creatine phosphokinase (CPK) 0.981573 Ferritin 0.995925 Arterial Lactic Acid 0.995216 Lipase dosage 0.998583 D-Dimer 1.000000 Albumin 0.997697 Hb saturation (arterial blood gases) 0.995216 pCO2 (arterial blood gas analysis) 0.995216 Base excess (arterial blood gas analysis) 0.995216 pH (arterial blood gas analysis) 0.995216 Total CO2 (arterial blood gas analysis) 0.995216 HCO3 (arterial blood gas analysis) 0.995216 pO2 (arterial blood gas analysis) 0.995216 Arteiral Fio2 0.996456 Phosphor 0.996456 ctO2 (arterial blood gas analysis) 0.995216 dtype: float64
# Keep only the columns whose fraction of missing values is below 0.9.
na_fraction = df.isna().sum() / df.shape[0]
kept_columns = df.columns[na_fraction < 0.9]
df = df[kept_columns]
# Show the missing fraction of the remaining columns to check the filter.
df.isna().sum() / df.shape[0]
Patient ID 0.000000 Patient age quantile 0.000000 SARS-Cov-2 exam result 0.000000 Patient addmited to regular ward (1=yes, 0=no) 0.000000 Patient addmited to semi-intensive unit (1=yes, 0=no) 0.000000 Patient addmited to intensive care unit (1=yes, 0=no) 0.000000 Hematocrit 0.893161 Hemoglobin 0.893161 Platelets 0.893338 Mean platelet volume 0.893870 Red blood Cells 0.893338 Lymphocytes 0.893338 Mean corpuscular hemoglobin concentration (MCHC) 0.893338 Leukocytes 0.893338 Basophils 0.893338 Mean corpuscular hemoglobin (MCH) 0.893338 Eosinophils 0.893338 Mean corpuscular volume (MCV) 0.893338 Monocytes 0.893515 Red blood cell distribution width (RDW) 0.893338 Respiratory Syncytial Virus 0.760099 Influenza A 0.760099 Influenza B 0.760099 Parainfluenza 1 0.760454 CoronavirusNL63 0.760454 Rhinovirus/Enterovirus 0.760454 Coronavirus HKU1 0.760454 Parainfluenza 3 0.760454 Chlamydophila pneumoniae 0.760454 Adenovirus 0.760454 Parainfluenza 4 0.760454 Coronavirus229E 0.760454 CoronavirusOC43 0.760454 Inf A H1N1 2009 0.760454 Bordetella pertussis 0.760454 Metapneumovirus 0.760454 Parainfluenza 2 0.760454 Influenza B, rapid test 0.854713 Influenza A, rapid test 0.854713 dtype: float64
# Redraw the missing-value heatmap for the current df, without the color bar.
na_cells_filtered = df.isna()
sns.heatmap(na_cells_filtered, cbar=False)
<AxesSubplot:>
# Remove the 'Patient ID' column, then display the resulting table.
df = df.drop(columns='Patient ID')
df
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.022340 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.950790 | -0.094610 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative |
| 2 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5639 | 3 | positive | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5640 | 17 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5641 | 4 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5642 | 10 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5643 | 19 | positive | 0 | 0 | 0 | 0.694287 | 0.541564 | -0.906829 | -0.325903 | 0.578024 | -0.295726 | -0.353319 | -1.288428 | -1.140144 | -0.135455 | -0.835508 | 0.025985 | 0.567652 | -0.182790 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5644 rows × 38 columns
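- Nous observons que notre dataset contient 5086 cas négatifs et 558 cas positifs.
- Nous pouvons affirmer que nos classes ne sont pas équilibrées (environ 10 % de cas positifs).
- Il faudra donc utiliser des métriques comme le "Score F1", "la sensibilité" ou "la précision" plutôt que l'exactitude (accuracy), qui serait trompeuse avec des classes aussi déséquilibrées.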
df['SARS-Cov-2 exam result'].value_counts()
negative 5086 positive 558 Name: SARS-Cov-2 exam result, dtype: int64
-Nous avons fait cette partie pour voir la distribution de nos differentes variables
-La première chose que nous voyons, c'est que toutes nos courbes sont centrées en zéro avec un écart-type égal à 1, ce qui nous laisse comprendre que ces données ont été standardisées.
-Nous pouvons aussi voir que beaucoup de ces variables suivent une distribution normale, mais pas toutes : la variable "Eosinophils", par exemple, ne la suit pas.
# Draw one figure per float column showing its distribution.
# sns.distplot is deprecated; histplot with stat="density" and kde=True draws
# the same kind of figure (density-scaled histogram plus a KDE curve).
for col in df.select_dtypes('float'):
    plt.figure()
    sns.histplot(df[col], kde=True, stat="density", kde_kws={"cut": 3})
/Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
-Nous voyons que nos variables qualitatives sont toutes binaires : soit positive/negative, soit detected/not_detected.
# One pie chart per object-typed column, sized by the frequency of each value.
for col in df.select_dtypes(include='object').columns:
    plt.figure()
    df[col].value_counts().plot(kind='pie')
-En consultant le docteur, nous avons vu que nous pouvons regrouper nos données en deux types : "Test viral" et "Taux sanguins".
L'idée est de voir, variable après variable, s'il y a une différence entre les cas positifs et négatifs ; à travers cela, nous pouvons conclure quelle variable est significative ou non.
-Pour l'hematocrit et l'hemoglobine on ne voit rien de speciale
-Sur les platelets nous constatons que les personnes atteintes du covid-19 ont des taux de platelets differents de celles qui sont negatives alors la conclusion que nous pouvons tirer c'est que les platelets sont significatives.
-Sur les leukocytes on a aussi une difference -> variable significative
-Sur les Mean platelet volume , red blood cell, Lymphocytes on ne voit pas difference majeur
-Sur les Basophils on observe une difference -> variable significative
-Sur les Eosinophils on observe une difference -> variable significative
-Sur les Monocytes on observe une difference -> variable significative
Pour être plus rigoureux, nous allons faire un test de significativité pour confirmer nos dires.
# Split the rows by exam result so both groups can be compared column by column.
positive_df = df.loc[df['SARS-Cov-2 exam result'].eq('positive')]
negative_df = df.loc[df['SARS-Cov-2 exam result'].eq('negative')]
Creation des sous ensembles blood et viral
# Fraction of missing values in each column.
missing_rate = df.isna().sum().div(df.shape[0])
# Columns are grouped by how often they are missing:
# strictly between 88% and 90% -> blood columns,
# strictly between 75% and 88% -> viral columns.
blood_columns = df.columns[missing_rate.gt(0.88) & missing_rate.lt(0.9)]
viral_columns = df.columns[missing_rate.gt(0.75) & missing_rate.lt(0.88)]
# For every blood column, overlay the distribution of the positive and the
# negative patients on the same figure.
# sns.distplot is deprecated; histplot with stat="density" and kde=True draws
# the same kind of figure (density-scaled histogram plus a KDE curve).
for col in blood_columns:
    plt.figure()
    sns.histplot(positive_df[col], kde=True, stat="density",
                 kde_kws={"cut": 3}, label='positive')
    sns.histplot(negative_df[col], kde=True, stat="density",
                 kde_kws={"cut": 3}, label='negative')
    plt.legend()
/Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
En analysant la matrice de correlation on collecte beaucoup d'information qui vont nous aider à prendre des decisions.
-Nous observons que nos variables Hematocrit et Hemoglobin sont très corrélées, donc nous pouvons prendre la décision d'éliminer l'une des deux. Mais pour être sûrs d'éliminer la bonne, nous allons une fois de plus consulter le docteur pour qu'il nous donne son avis d'expert là-dessus.
-les variables "Mean corpuscular volume et Mean corpuscular hemoglobin" sont correlées aussi donc on va appliquer le meme principe
# Pairwise plots between all columns listed in blood_columns.
sns.pairplot(df[blood_columns])
<seaborn.axisgrid.PairGrid at 0x7f892a033b50>
# Heatmap of the pairwise correlation matrix of the blood_columns columns.
sns.heatmap(df[blood_columns].corr())
<AxesSubplot:>
# Number of non-missing values in each blood_columns column.
df[blood_columns].count()
Hematocrit 603 Hemoglobin 603 Platelets 602 Mean platelet volume 599 Red blood Cells 602 Lymphocytes 602 Mean corpuscular hemoglobin concentration (MCHC) 602 Leukocytes 602 Basophils 602 Mean corpuscular hemoglobin (MCH) 602 Eosinophils 602 Mean corpuscular volume (MCV) 602 Monocytes 601 Red blood cell distribution width (RDW) 602 dtype: int64
# Number of non-missing values in each viral_columns column.
df[viral_columns].count()
Respiratory Syncytial Virus 1354 Influenza A 1354 Influenza B 1354 Parainfluenza 1 1352 CoronavirusNL63 1352 Rhinovirus/Enterovirus 1352 Coronavirus HKU1 1352 Parainfluenza 3 1352 Chlamydophila pneumoniae 1352 Adenovirus 1352 Parainfluenza 4 1352 Coronavirus229E 1352 CoronavirusOC43 1352 Inf A H1N1 2009 1352 Bordetella pertussis 1352 Metapneumovirus 1352 Parainfluenza 2 1352 Influenza B, rapid test 820 Influenza A, rapid test 820 dtype: int64
Notre objectif dans ce cas est de voir si nous devons supprimer ces trois variables ou non : -Patient addmited to regular ward (1=yes, 0=no) -Patient addmited to semi-intensive unit -Patient addmited to intensive care unit (1=yes, 0=no)
En faisant un distplot, nous arrivons à la conclusion qu'il faut garder ces trois variables, car on peut voir qu'elles sont significatives.
# Compare the regular-ward admission flag between positive and negative patients.
# sns.distplot is deprecated; histplot with stat="density" and kde=True draws
# the same kind of figure (density-scaled histogram plus a KDE curve).
plt.figure()
sns.histplot(positive_df['Patient addmited to regular ward (1=yes, 0=no)'],
             kde=True, stat="density", kde_kws={"cut": 3}, label='positive')
sns.histplot(negative_df['Patient addmited to regular ward (1=yes, 0=no)'],
             kde=True, stat="density", kde_kws={"cut": 3}, label='negative')
plt.legend()
/Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<matplotlib.legend.Legend at 0x7f892ada73a0>
# Compare the semi-intensive-unit admission flag between positive and negative patients.
# sns.distplot is deprecated; histplot with stat="density" and kde=True draws
# the same kind of figure (density-scaled histogram plus a KDE curve).
plt.figure()
sns.histplot(positive_df['Patient addmited to semi-intensive unit (1=yes, 0=no)'],
             kde=True, stat="density", kde_kws={"cut": 3}, label='positive')
sns.histplot(negative_df['Patient addmited to semi-intensive unit (1=yes, 0=no)'],
             kde=True, stat="density", kde_kws={"cut": 3}, label='negative')
plt.legend()
/Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<matplotlib.legend.Legend at 0x7f892ae03640>
Notre but est de voir si nos variables de type "test viral" sont significatifs
En comparant notre target avec les variables de type "test viral", nous observons que ces variables n'ont aucun lien avec le covid-19, donc elles ne sont pas significatives. Mais attention : nous avons jugé nécessaire de vérifier cette hypothèse dans la partie pre-processing. Nous allons appliquer un modèle en gardant ces variables, puis un modèle en les enlevant, et comparer ensuite les scores pour voir si ce que nous disons est vrai ou faux.
# For each viral-test column, draw an annotated heatmap of its contingency
# table against the SARS-Cov-2 exam result.
for col in viral_columns:
    plt.figure()
    sns.heatmap(
        data=pd.crosstab(index=df['SARS-Cov-2 exam result'], columns=df[col]),
        annot=True,
        fmt='d',
        cmap='crest',
    )
# Same contingency heatmap, drawn for the 'Influenza A' column only.
plt.figure()
sns.heatmap(pd.crosstab(df['SARS-Cov-2 exam result'],df['Influenza A']),annot=True,cmap='crest',fmt='d')
<AxesSubplot:xlabel='Influenza A', ylabel='SARS-Cov-2 exam result'>
-Il est difficile pour nous d'interpreter pourquoi l'age dans notre cas est entre 0 et 19 alors qu'en general l'age est entre 1 et 100 ans et malheureusement la personne qui a posté ce dataset sur kaggle n'a laissé aucune information sur la variable "Patient age quantile" donc tout ce que nous pouvons faire c'est emettre des hypothèses. En jettant un coup d'oeil dans la discussion sur kaggle on s'est aperçu que beaucoup de personnes ont été confronté à ce problème aussi donc l'hypothèse qu'ils ont emis c'est de voir ces chiffres comme des tranches d'age: 1 [0 5] 2 [6 10] 3 [11 15] 4 [16 20] 5 [21 25] 6 [26 30] ......
-Grâce à la fonction countplot, nous avons pu démentir une information qui circulait selon laquelle les enfants ne peuvent pas être atteints du covid-19 : nous avons bel et bien vu qu'enfants comme adultes sont exposés à cette maladie. L'âge, à lui seul, n'exclut donc pas l'infection.
-En revanche en se basant sur notre hypothèse sur l'age nous observons que les individus de faible age sont tres peu contaminés.
# Number of patients per age quantile, split by exam result.
sns.countplot(data=df,x='Patient age quantile',hue='SARS-Cov-2 exam result')
<AxesSubplot:xlabel='Patient age quantile', ylabel='count'>
Le test de student permet de verifier si la moyenne entre deux distributions est significativement differente.
Les individus atteints du covid-19 ont des taux de Leukocytes, Monocytes, Platelets significativement différents
H0 = Les taux moyens sont ÉGAUX chez les individus positifs et négatifs
Nous pouvons conclure que la décision que nous avions prise plus haut à partir des graphiques est bien fondée, car le test vient de la confirmer.
# Draw as many negative rows as there are positive rows so both groups have
# the same size in the t-tests. The seed is fixed so the drawn sample, and
# therefore the test verdicts, are the same on every run.
balanced_neg = negative_df.sample(positive_df.shape[0], random_state=0)
def t_test(col, alpha=0.02):
    """Compare one column between the balanced negatives and the positives.

    Runs ``ttest_ind`` on the non-missing values of ``col`` taken from the
    module-level ``balanced_neg`` and ``positive_df`` frames.

    Args:
        col: Name of the column to test.
        alpha: Significance threshold applied to the p-value. Defaults to
            0.02, the value that used to be hard-coded.

    Returns:
        The string 'H0 Rejetée' when the p-value is below ``alpha``,
        otherwise the integer 0.
    """
    _, p = ttest_ind(balanced_neg[col].dropna(), positive_df[col].dropna())
    # A NaN p-value compares False here and therefore yields 0.
    return 'H0 Rejetée' if p < alpha else 0
# Print one line per blood column: the name padded with dashes to 50
# characters, followed by the t-test verdict.
for col in blood_columns:
    outcome = t_test(col)
    print(f'{col :-<50} {outcome}')
Hematocrit---------------------------------------- H0 Rejetée Hemoglobin---------------------------------------- H0 Rejetée Platelets----------------------------------------- H0 Rejetée Mean platelet volume ----------------------------- H0 Rejetée Red blood Cells----------------------------------- H0 Rejetée Lymphocytes--------------------------------------- 0 Mean corpuscular hemoglobin concentration (MCHC)-- 0 Leukocytes---------------------------------------- H0 Rejetée Basophils----------------------------------------- 0 Mean corpuscular hemoglobin (MCH)----------------- 0 Eosinophils--------------------------------------- H0 Rejetée Mean corpuscular volume (MCV)--------------------- 0 Monocytes----------------------------------------- H0 Rejetée Red blood cell distribution width (RDW)----------- 0
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler,RobustScaler
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,classification_report,confusion_matrix,precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import shap
import xgboost
# Rebuild the two column groups as plain lists, selected by missing-value rate.
blood_columns = list(df.columns[(missing_rate < 0.9) & (missing_rate >0.88)])
# NOTE(review): the upper bound here is 0.80, while the earlier viral_columns
# selection in this file used 0.88 — confirm the narrower range is intended.
viral_columns = list(df.columns[(missing_rate < 0.80) & (missing_rate > 0.75)])
Suppression de toutes mes variables virales, comme convenu dans la partie EDA
# Keep only the age quantile, the target and the blood columns.
key_columns = ['Patient age quantile', 'SARS-Cov-2 exam result']
ds = df[key_columns + blood_columns]
# Separate copy of that selection, used below for the imputation.
df_imp = ds.copy()
def encode(df):
    """Replace 'positive'/'negative' labels with 1/0 in every object column.

    The frame is modified in place and also returned, so callers may either
    use the return value or keep working with the object they passed in.
    Values of an object column that are neither 'positive' nor 'negative'
    become NaN (standard ``Series.map`` behaviour for unmapped keys).

    Args:
        df: DataFrame whose object-dtype columns hold the string labels.

    Returns:
        The same DataFrame object, with its object columns re-encoded.
    """
    code = {
        'positive': 1,
        'negative': 0,
    }
    for col in df.select_dtypes('object'):
        # Plain column assignment replaces the column (and its dtype).
        # ``df.loc[:, col] = ...`` writes into the existing object-dtype
        # column on recent pandas, leaving the labels non-numeric.
        df[col] = df[col].map(code)
    return df
# Impute missing values with the median of each class (negative / positive).
# NOTE(review): the medians are computed per target class, so the imputed
# feature values depend on the label -- confirm this is intended before
# trusting the downstream scores.
# .copy() gives each subset its own data, so filling it never writes into a
# slice of df_imp (the SettingWithCopy warning the in-place version raised).
neg = df_imp[df_imp['SARS-Cov-2 exam result'] == 'negative'].copy()
# numeric_only=True: the target column holds strings and has no median.
neg = neg.fillna(neg.median(numeric_only=True))
pos = df_imp[df_imp['SARS-Cov-2 exam result'] == 'positive'].copy()
pos = pos.fillna(pos.median(numeric_only=True))
Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Stack the two imputed class subsets back into one frame (negatives first),
# then display it.
df_median = pd.concat([neg,pos])
df_median
| Patient age quantile | SARS-Cov-2 exam result | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| 1 | 17 | negative | 0.236515 | -0.022340 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.950790 | -0.094610 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 |
| 2 | 8 | negative | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| 3 | 5 | negative | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| 4 | 15 | negative | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5632 | 16 | positive | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5633 | 4 | positive | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5634 | 15 | positive | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5639 | 3 | positive | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5643 | 19 | positive | 0.694287 | 0.541564 | -0.906829 | -0.325903 | 0.578024 | -0.295726 | -0.353319 | -1.288428 | -1.140144 | -0.135455 | -0.835508 | 0.025985 | 0.567652 | -0.182790 |
5644 rows × 16 columns
# Count the remaining NaNs per column, largest first.
df_median.isna().sum().sort_values(ascending=False)
Patient age quantile 0 SARS-Cov-2 exam result 0 Hematocrit 0 Hemoglobin 0 Platelets 0 Mean platelet volume 0 Red blood Cells 0 Lymphocytes 0 Mean corpuscular hemoglobin concentration (MCHC) 0 Leukocytes 0 Basophils 0 Mean corpuscular hemoglobin (MCH) 0 Eosinophils 0 Mean corpuscular volume (MCV) 0 Monocytes 0 Red blood cell distribution width (RDW) 0 dtype: int64
# dp is a snapshot taken before encoding, so it keeps the string labels.
dp = df_median.copy()
# encode() rewrites df_median in place; the returned frame is displayed.
encode(df_median)
| Patient age quantile | SARS-Cov-2 exam result | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | 0 | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| 1 | 17 | 0 | 0.236515 | -0.022340 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.950790 | -0.094610 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 |
| 2 | 8 | 0 | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| 3 | 5 | 0 | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| 4 | 15 | 0 | 0.019074 | -0.022340 | 0.022745 | -0.101517 | -0.003778 | 0.002791 | -0.054585 | -0.080696 | -0.223767 | 0.125903 | -0.245556 | 0.086074 | -0.220244 | -0.182790 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5632 | 16 | 1 | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5633 | 4 | 1 | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5634 | 15 | 1 | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5639 | 3 | 1 | 0.350958 | 0.416252 | -0.718402 | 0.235063 | 0.278308 | -0.073970 | 0.144572 | -0.834833 | -0.223767 | 0.125903 | -0.666950 | -0.054132 | 0.515126 | -0.271247 |
| 5643 | 19 | 1 | 0.694287 | 0.541564 | -0.906829 | -0.325903 | 0.578024 | -0.295726 | -0.353319 | -1.288428 | -1.140144 | -0.135455 | -0.835508 | 0.025985 | 0.567652 | -0.182790 |
5644 rows × 16 columns
# Class counts of the encoded target (0 = negative, 1 = positive).
df_median['SARS-Cov-2 exam result'].value_counts()
0 5086 1 558 Name: SARS-Cov-2 exam result, dtype: int64
# Features come from the pre-encoding snapshot (target column removed);
# the target is the 0/1 column of the encoded frame.
X = dp.drop(columns='SARS-Cov-2 exam result')
y = df_median.loc[:, 'SARS-Cov-2 exam result']
Vu que nous avons un déséquilibre de classes, nous avons décidé d'utiliser une technique de rééquilibrage : SMOTE. SMOTE est une technique de suréchantillonnage qui génère des échantillons synthétiques à partir de la classe minoritaire. Elle est utilisée pour obtenir un ensemble d'apprentissage synthétiquement équilibré (ou presque équilibré) en classes, qui est ensuite utilisé pour entraîner le classifieur.
# Oversample the minority class with SMOTE, then hold out 30% for testing.
# A fixed random_state makes the synthetic samples, and therefore every score
# computed from them, reproducible from one run to the next.
# NOTE(review): resampling happens before the split, so synthetic points
# interpolated from rows that end up in the test set can land in the training
# set -- consider splitting first and resampling only the training portion.
X, y = SMOTE(random_state=42).fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
# AdaBoost behind a StandardScaler: fit on the training split, report on test.
model_1 = make_pipeline(StandardScaler(), AdaBoostClassifier())
pred = model_1.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.99 0.98 0.99 1039
1 0.98 0.99 0.98 996
accuracy 0.98 2035
macro avg 0.98 0.98 0.98 2035
weighted avg 0.98 0.98 0.98 2035
# Score of the fitted pipeline on the test split (displayed as cell output).
model_1.score(X_test,y_test)
0.9809960681520314
# Same AdaBoost baseline, this time behind a MinMaxScaler.
model_1 = make_pipeline(MinMaxScaler(), AdaBoostClassifier())
pred = model_1.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.98 0.98 0.98 1551
1 0.98 0.98 0.98 1501
accuracy 0.98 3052
macro avg 0.98 0.98 0.98 3052
weighted avg 0.98 0.98 0.98 3052
# Same AdaBoost baseline, this time behind a RobustScaler.
model_1 = make_pipeline(RobustScaler(), AdaBoostClassifier())
pred = model_1.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.98 0.98 0.98 1551
1 0.98 0.98 0.98 1501
accuracy 0.98 3052
macro avg 0.98 0.98 0.98 3052
weighted avg 0.98 0.98 0.98 3052
# Confusion matrix of the latest predictions, drawn as an annotated heatmap.
class_labels = ['Classe 0', 'Classe 1']
cm = confusion_matrix(y_test, pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Columns are predictions, rows are true labels.
plt.gca().set_title('Matrice de confusion')
plt.gca().set_xlabel('Prédictions')
plt.gca().set_ylabel('Vraies étiquettes')
plt.show()
Dans ce cas, les scores de validation croisée sont élevés et relativement similaires pour chaque fold, ce qui suggère une bonne performance du modèle sur les sous-ensembles d'entraînement et de validation. Le score moyen de validation croisée est également élevé (environ 0.984), indiquant une performance globalement solide du modèle.
Les métriques d'évaluation sur l'ensemble de validation montrent également de bons résultats. L'accuracy (exactitude) est élevée (environ 0.985), ce qui signifie que le modèle prédit correctement la classe de la plupart des échantillons. La precision (précision) est également élevée (environ 0.980), ce qui indique un faible taux de faux positifs. Le recall (rappel) est élevé (environ 0.990), ce qui suggère un faible taux de faux négatifs. Le F1-score est également élevé (environ 0.985), ce qui représente une bonne harmonie entre la précision et le rappel.
Le score sur l'ensemble de test est également élevé (environ 0.984), ce qui confirme la performance générale du modèle.
Dans l'ensemble, ces résultats indiquent que le modèle a une bonne capacité à généraliser et à prédire avec précision sur de nouvelles données. Notre modèle n'est donc pas en surapprentissage (overfitting).
# 80/10/10 split: train, then the remaining 20% halved into validation / test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

model = AdaBoostClassifier()

# 5-fold cross-validation on the training portion only.
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5)
mean_score = cross_val_scores.mean()

print("Scores de validation croisée :")
for fold_no, fold_score in enumerate(cross_val_scores, start=1):
    print(f"Fold {fold_no}: {fold_score}")
print("Score moyen : ", mean_score)

# Refit on the whole training portion, then score the validation split.
model.fit(X_train, y_train)
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

print("Métriques d'évaluation sur l'ensemble de validation:")
for label, value in (
    ("Accuracy:", val_accuracy),
    ("Precision:", val_precision),
    ("Recall:", val_recall),
    ("F1-score:", val_f1),
):
    print(label, value)

# Accuracy on the held-out test split.
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Score sur l'ensemble de test :", test_accuracy)
Scores de validation croisée : Fold 1: 0.984029484029484 Fold 2: 0.984029484029484 Fold 3: 0.9852489244007375 Fold 4: 0.9803318992009834 Fold 5: 0.986478180700676 Score moyen : 0.984023594472273 Métriques d'évaluation sur l'ensemble de validation: Accuracy: 0.9852507374631269 Precision: 0.9801587301587301 Recall: 0.9899799599198397 F1-score: 0.9850448654037885 Score sur l'ensemble de test : 0.9842829076620825
# Bar chart of the AdaBoost pipeline step's feature importances, one bar per
# column of X_train.
ada_step = model_1.named_steps['adaboostclassifier']
feature_importances = ada_step.feature_importances_
pd.DataFrame(feature_importances, index=X_train.columns).plot.bar(figsize=(8,6))
<AxesSubplot:>
# Random forest (seeded) behind a StandardScaler: fit on train, report on test.
model_2 = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))
pred = model_2.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 1.00 0.99 0.99 1551
1 0.99 1.00 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Score of the fitted pipeline on the test split (displayed as cell output).
model_2.score(X_test,y_test)
0.9931192660550459
# Same seeded random forest, this time behind a MinMaxScaler.
model_2 = make_pipeline(MinMaxScaler(), RandomForestClassifier(random_state=0))
pred = model_2.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 1.00 0.99 0.99 1551
1 0.99 1.00 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Same seeded random forest, this time behind a RobustScaler.
model_2 = make_pipeline(RobustScaler(), RandomForestClassifier(random_state=0))
pred = model_2.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 1.00 0.99 0.99 1551
1 0.99 1.00 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Confusion matrix of the latest predictions, drawn as an annotated heatmap.
class_labels = ['Classe 0', 'Classe 1']
cm = confusion_matrix(y_test, pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Columns are predictions, rows are true labels.
plt.gca().set_title('Matrice de confusion')
plt.gca().set_xlabel('Prédictions')
plt.gca().set_ylabel('Vraies étiquettes')
plt.show()
Les scores de validation croisée et les métriques d'évaluation sur les ensembles de validation et de test indiquent des performances élevées et une absence apparente d'overfitting. Scores de validation croisée : ils sont tous élevés, variant entre 0.989 et 0.995. Cela suggère que le modèle généralise bien et n'est pas trop ajusté aux données d'entraînement spécifiques.
# 80/10/10 split: train, then the remaining 20% halved into validation / test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

model = RandomForestClassifier(random_state=0)

# 5-fold cross-validation on the training portion only.
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5)
mean_score = cross_val_scores.mean()

print("Scores de validation croisée :")
for fold_no, fold_score in enumerate(cross_val_scores, start=1):
    print(f"Fold {fold_no}: {fold_score}")
print("Score moyen : ", mean_score)

# Refit on the whole training portion, then score the validation split.
model.fit(X_train, y_train)
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

print("Métriques d'évaluation sur l'ensemble de validation:")
for label, value in (
    ("Accuracy:", val_accuracy),
    ("Precision:", val_precision),
    ("Recall:", val_recall),
    ("F1-score:", val_f1),
):
    print(label, value)

# Accuracy on the held-out test split.
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Score sur l'ensemble de test :", test_accuracy)
Scores de validation croisée : Fold 1: 0.9895577395577395 Fold 2: 0.9957002457002457 Fold 3: 0.9938537185003073 Fold 4: 0.9944683466502766 Fold 5: 0.9950829748002459 Score moyen : 0.993732605041763 Métriques d'évaluation sur l'ensemble de validation: Accuracy: 0.9960668633235005 Precision: 0.9940119760479041 Recall: 0.9979959919839679 F1-score: 0.996 Score sur l'ensemble de test : 0.9980353634577603
# Bar chart of the random-forest pipeline step's feature importances, one bar
# per column of X_train.
rf_step = model_2.named_steps['randomforestclassifier']
feature_importances = rf_step.feature_importances_
pd.DataFrame(feature_importances, index=X_train.columns).plot.bar(figsize=(8,6))
<AxesSubplot:>
# XGBoost (200 trees, depth 4) behind a RobustScaler: fit on train, report on test.
model_3 = make_pipeline(
    RobustScaler(),
    xgboost.XGBClassifier(n_estimators=200, max_depth=4),
)
pred = model_3.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 1.00 0.99 0.99 1551
1 0.99 1.00 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Same XGBoost configuration, this time behind a MinMaxScaler.
model_3 = make_pipeline(
    MinMaxScaler(),
    xgboost.XGBClassifier(n_estimators=200, max_depth=4),
)
pred = model_3.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 1.00 0.99 0.99 1551
1 0.99 1.00 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Same XGBoost configuration, this time behind a StandardScaler.
model_3 = make_pipeline(
    StandardScaler(),
    xgboost.XGBClassifier(n_estimators=200, max_depth=4),
)
pred = model_3.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.99 0.99 0.99 521
1 0.99 0.99 0.99 497
accuracy 0.99 1018
macro avg 0.99 0.99 0.99 1018
weighted avg 0.99 0.99 0.99 1018
# Confusion matrix of the latest predictions, drawn as an annotated heatmap.
class_labels = ['Classe 0', 'Classe 1']
cm = confusion_matrix(y_test, pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Columns are predictions, rows are true labels.
plt.gca().set_title('Matrice de confusion')
plt.gca().set_xlabel('Prédictions')
plt.gca().set_ylabel('Vraies étiquettes')
plt.show()
Ces résultats suggèrent que le modèle XGBoost généralise bien et n'est pas en overfitting.
# 80/10/10 split: train, then the remaining 20% halved into validation / test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

model = xgboost.XGBClassifier(n_estimators=200, max_depth=4)

# 5-fold cross-validation on the training portion only.
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5)
mean_score = cross_val_scores.mean()

print("Scores de validation croisée :")
for fold_no, fold_score in enumerate(cross_val_scores, start=1):
    print(f"Fold {fold_no}: {fold_score}")
print("Score moyen : ", mean_score)

# Refit on the whole training portion, then score the validation split.
model.fit(X_train, y_train)
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

print("Métriques d'évaluation sur l'ensemble de validation:")
for label, value in (
    ("Accuracy:", val_accuracy),
    ("Precision:", val_precision),
    ("Recall:", val_recall),
    ("F1-score:", val_f1),
):
    print(label, value)

# Accuracy on the held-out test split.
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Score sur l'ensemble de test :", test_accuracy)
Scores de validation croisée : Fold 1: 0.9907862407862408 Fold 2: 0.9957002457002457 Fold 3: 0.9913952059004303 Fold 4: 0.9920098340503996 Fold 5: 0.9932390903503381 Score moyen : 0.9926261233575309 Métriques d'évaluation sur l'ensemble de validation: Accuracy: 0.992133726647001 Precision: 0.9900199600798403 Recall: 0.9939879759519038 F1-score: 0.9919999999999999 Score sur l'ensemble de test : 0.9921414538310412
# Bar chart of the XGBoost pipeline step's feature importances, one bar per
# column of X_train.
xgb_step = model_3.named_steps['xgbclassifier']
feature_importances = xgb_step.feature_importances_
pd.DataFrame(feature_importances, index=X_train.columns).plot.bar(figsize=(8,6))
<AxesSubplot:>
# Polynomial-kernel SVC behind a StandardScaler: fit on train, report on test.
model_4 = make_pipeline(StandardScaler(), SVC(kernel='poly'))
pred = model_4.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 1.00 0.98 0.99 1551
1 0.98 1.00 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Same polynomial-kernel SVC, this time behind a RobustScaler.
model_4 = make_pipeline(RobustScaler(), SVC(kernel='poly'))
pred = model_4.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.99 0.98 0.98 1551
1 0.98 0.99 0.98 1501
accuracy 0.98 3052
macro avg 0.98 0.98 0.98 3052
weighted avg 0.98 0.98 0.98 3052
# Confusion matrix of the latest predictions, drawn as an annotated heatmap.
class_labels = ['Classe 0', 'Classe 1']
cm = confusion_matrix(y_test, pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Columns are predictions, rows are true labels.
plt.gca().set_title('Matrice de confusion')
plt.gca().set_xlabel('Prédictions')
plt.gca().set_ylabel('Vraies étiquettes')
plt.show()
# Standardise the features; the scaler is fitted on the training split only.
scaler2 = StandardScaler()
X_train = scaler2.fit_transform(X_train)
X_test = scaler2.transform(X_test)

# AdaBoost search space: 4 ensemble sizes x 5 learning rates.
parameters = dict(
    n_estimators=np.arange(1150, 1250, 25),
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
)
grid = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=parameters,
    cv=5,
    verbose=0,
)
grid.fit(X_train, y_train)

print('GridSearch CV meilleur score : {:.4f}\n\n'.format(grid.best_score_))
# Report the parameter combination with the best cross-validated score.
print('Les parameters qui donnent les meilleurs resultats :','\n\n', (grid.best_params_))
GridSearch CV meilleur score : 0.9878
Les parameters qui donnent les meilleurs resultats :
{'learning_rate': 0.1, 'n_estimators': 1150}
# Tuned AdaBoost: the best estimator found by the grid search fitted in the
# preceding cell. That search object is named `grid`; `grid_search_cv` is
# never defined in this notebook and raised a NameError.
model1_1 = grid.best_estimator_
# Score of the tuned model on the test split (displayed as cell output).
model1_1.score(X_test, y_test)
0.991480996068152
# Per-class precision / recall / F1 of the tuned AdaBoost on the test split.
print(classification_report(y_test,model1_1.predict(X_test)))
precision recall f1-score support
0 0.99 0.99 0.99 1551
1 0.99 0.99 0.99 1501
accuracy 0.99 3052
macro avg 0.99 0.99 0.99 3052
weighted avg 0.99 0.99 0.99 3052
# Standardise the features; the scaler is fitted on the training split only.
# NOTE(review): X_train / X_test were already standardised by an earlier cell
# when the notebook is run top to bottom -- confirm the repeat is intended.
scaler2 = StandardScaler()
X_train = scaler2.fit_transform(X_train)
X_test = scaler2.transform(X_test)

# Random-forest search space: 6 forest sizes x 7 depths x 2 split criteria.
parameters = dict(
    n_estimators=[200, 250, 300, 350, 400, 500],
    max_depth=[4, 5, 6, 7, 8, 9, 10],
    criterion=['gini', 'entropy'],
)
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    cv=5,
    verbose=0,
)
grid.fit(X_train, y_train)

print('GridSearch CV meilleur score : {:.4f}\n\n'.format(grid.best_score_))
# Report the parameter combination with the best cross-validated score.
print('Les parameters qui donnent les meilleurs resultats :','\n\n', (grid.best_params_))
GridSearch CV meilleur score : 0.9934
Les parameters qui donnent les meilleurs resultats :
{'criterion': 'gini', 'max_depth': 8, 'n_estimators': 400}
# Tuned random forest: best estimator of the grid search fitted just above.
model2_2 = grid.best_estimator_
# Score on the test split (displayed as cell output).
model2_2.score(X_test,y_test)
0.9918086500655308
# Standardise the features; the scaler is fitted on the training split only.
# NOTE(review): X_train / X_test were already standardised by an earlier cell
# when the notebook is run top to bottom -- confirm the repeat is intended.
scaler2 = StandardScaler()
X_train = scaler2.fit_transform(X_train)
X_test = scaler2.transform(X_test)

# XGBoost search space: 3 learning rates x 3 ensemble sizes x 3 depths.
parameters = dict(
    learning_rate=[0.1, 0.5, 1.0],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
)
xgb = xgboost.XGBClassifier()
grid = GridSearchCV(estimator=xgb, param_grid=parameters, cv=5, verbose=0)
# Last expression of the cell: the fitted search object is displayed.
grid.fit(X_train, y_train)
GridSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, ...),
param_grid={'learning_rate': [0.1, 0.5, 1.0],
'max_depth': [3, 5, 7],
'n_estimators': [100, 200, 300]})
# Best mean cross-validated score found by the XGBoost grid search.
print('GridSearch CV meilleur score : {:.4f}\n\n'.format(grid.best_score_))
# Print the parameters that give the best results.
print('Les parameters qui donnent les meilleurs resultats :','\n\n', (grid.best_params_))
GridSearch CV meilleur score : 0.9933
Les parameters qui donnent les meilleurs resultats :
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
# Tuned XGBoost: best estimator of the grid search fitted just above.
model3_1 = grid.best_estimator_
# Score on the test split (displayed as cell output).
model3_1.score(X_test,y_test)
0.9918086500655308
# Standardise the features; the scaler is fitted on the training split only.
scaler2 = StandardScaler()
X_train = scaler2.fit_transform(X_train)
X_test = scaler2.transform(X_test)

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Two sub-grids: a linear kernel tuned over C, and an RBF kernel tuned over
# C and gamma. The keys are plain SVC parameter names because the estimator
# below is a bare SVC; a "clf__" prefix is only valid for a pipeline that has
# a step called "clf", and GridSearchCV rejects it otherwise.
parameters = [
    {'C': param_range, 'kernel': ['linear']},
    {'C': param_range, 'gamma': param_range, 'kernel': ['rbf']},
]
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    verbose=0,
)
grid.fit(X_train, y_train)

# Print the parameters that give the best results.
print('Parameters that give the best results :','\n\n', (grid.best_params_))
# Print the estimator chosen by the search.
print('\n\nEstimator that was chosen by the search :','\n\n', (grid.best_estimator_))

model4_1 = grid.best_estimator_
# Score of the tuned SVC on the test split (displayed as cell output).
model4_1.score(X_test, y_test)
# Positive-class probabilities (column 1 of predict_proba) for each tuned model.
# The SVM is left out of this comparison, as in the original cell.
ada_probs = model1_1.predict_proba(X_test)[:, 1]
rf_probs = model2_2.predict_proba(X_test)[:, 1]
xgb_probs = model3_1.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_curve, roc_auc_score

# Area under the ROC curve for each tuned model, computed once from the
# positive-class probabilities.
rf_auc = roc_auc_score(y_test, rf_probs)
ada_auc = roc_auc_score(y_test, ada_probs)
xgb_auc = roc_auc_score(y_test, xgb_probs)

# ROC curve points; the threshold array returned by roc_curve is not needed.
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
ada_fpr, ada_tpr, _ = roc_curve(y_test, ada_probs)
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_probs)

# One curve per model, with its AUROC in the legend.
plt.plot(rf_fpr, rf_tpr, marker='.', label='Random Forest (AUROC = %0.3f)' % rf_auc)
plt.plot(ada_fpr, ada_tpr, marker='.', label='AdaBoost (AUROC = %0.3f)' % ada_auc)
plt.plot(xgb_fpr, xgb_tpr, marker='.', label='XGBoost (AUROC = %0.3f)' % xgb_auc)

plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
L'AUC est une mesure de performance utilisée pour évaluer la qualité d'un modèle de classification binaire à partir de la courbe ROC. L'AUC représente la probabilité que le modèle attribue à une instance positive tirée au hasard un score plus élevé qu'à une instance négative tirée au hasard.
Un AUC de 1 signifie que le modèle est capable de classer parfaitement toutes les instances positives devant toutes les instances négatives, sans aucune erreur de classification. C'est le résultat idéal et indique une performance exceptionnelle du modèle.
Vu que notre modèle est capable de classer parfaitement toutes les instances positives devant les instances négatives, nous allons choisir le modèle qui donne le meilleur score.